In [1]:
#import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.simplefilter("ignore")
In [2]:
#import datafile
# NOTE(review): relative path — assumes "vehicle-1.csv" sits next to the notebook;
# consider a configurable DATA_DIR. Preview the first rows to sanity-check the load.
vehicle=pd.read_csv("vehicle-1.csv")
vehicle.head()
Out[2]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio class
0 95 48.0 83.0 178.0 72.0 10 162.0 42.0 20.0 159 176.0 379.0 184.0 70.0 6.0 16.0 187.0 197 van
1 91 41.0 84.0 141.0 57.0 9 149.0 45.0 19.0 143 170.0 330.0 158.0 72.0 9.0 14.0 189.0 199 van
2 104 50.0 106.0 209.0 66.0 10 207.0 32.0 23.0 158 223.0 635.0 220.0 73.0 14.0 9.0 188.0 196 car
3 93 41.0 82.0 159.0 63.0 9 144.0 46.0 19.0 143 160.0 309.0 127.0 63.0 6.0 10.0 199.0 207 van
4 85 44.0 70.0 205.0 103.0 52 149.0 45.0 19.0 144 241.0 325.0 188.0 127.0 9.0 11.0 180.0 183 bus
In [3]:
#Data type of variables in data
vehicle.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 846 entries, 0 to 845
Data columns (total 19 columns):
compactness                    846 non-null int64
circularity                    841 non-null float64
distance_circularity           842 non-null float64
radius_ratio                   840 non-null float64
pr.axis_aspect_ratio           844 non-null float64
max.length_aspect_ratio        846 non-null int64
scatter_ratio                  845 non-null float64
elongatedness                  845 non-null float64
pr.axis_rectangularity         843 non-null float64
max.length_rectangularity      846 non-null int64
scaled_variance                843 non-null float64
scaled_variance.1              844 non-null float64
scaled_radius_of_gyration      844 non-null float64
scaled_radius_of_gyration.1    842 non-null float64
skewness_about                 840 non-null float64
skewness_about.1               845 non-null float64
skewness_about.2               845 non-null float64
hollows_ratio                  846 non-null int64
class                          846 non-null object
dtypes: float64(14), int64(4), object(1)
memory usage: 125.7+ KB
In [4]:
vehicle.shape
Out[4]:
(846, 19)
In [5]:
#there are many columns which contain missing value
vehicle.isna().sum()
Out[5]:
compactness                    0
circularity                    5
distance_circularity           4
radius_ratio                   6
pr.axis_aspect_ratio           2
max.length_aspect_ratio        0
scatter_ratio                  1
elongatedness                  1
pr.axis_rectangularity         3
max.length_rectangularity      0
scaled_variance                3
scaled_variance.1              2
scaled_radius_of_gyration      2
scaled_radius_of_gyration.1    4
skewness_about                 6
skewness_about.1               1
skewness_about.2               1
hollows_ratio                  0
class                          0
dtype: int64
In [6]:
vehicle.describe()
Out[6]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio
count 846.000000 841.000000 842.000000 840.000000 844.000000 846.000000 845.000000 845.000000 843.000000 846.000000 843.000000 844.000000 844.000000 842.000000 840.000000 845.000000 845.000000 846.000000
mean 93.678487 44.828775 82.110451 168.888095 61.678910 8.567376 168.901775 40.933728 20.582444 147.998818 188.631079 439.494076 174.709716 72.447743 6.364286 12.602367 188.919527 195.632388
std 8.234474 6.152172 15.778292 33.520198 7.891463 4.601217 33.214848 7.816186 2.592933 14.515652 31.411004 176.666903 32.584808 7.486190 4.920649 8.936081 6.155809 7.438797
min 73.000000 33.000000 40.000000 104.000000 47.000000 2.000000 112.000000 26.000000 17.000000 118.000000 130.000000 184.000000 109.000000 59.000000 0.000000 0.000000 176.000000 181.000000
25% 87.000000 40.000000 70.000000 141.000000 57.000000 7.000000 147.000000 33.000000 19.000000 137.000000 167.000000 318.000000 149.000000 67.000000 2.000000 5.000000 184.000000 190.250000
50% 93.000000 44.000000 80.000000 167.000000 61.000000 8.000000 157.000000 43.000000 20.000000 146.000000 179.000000 363.500000 173.500000 71.500000 6.000000 11.000000 188.000000 197.000000
75% 100.000000 49.000000 98.000000 195.000000 65.000000 10.000000 198.000000 46.000000 23.000000 159.000000 217.000000 587.000000 198.000000 75.000000 9.000000 19.000000 193.000000 201.000000
max 119.000000 59.000000 112.000000 333.000000 138.000000 55.000000 265.000000 61.000000 29.000000 188.000000 320.000000 1018.000000 268.000000 135.000000 22.000000 41.000000 206.000000 211.000000

Data Exploration

UNIVARIATE ANALYSIS

In [7]:
#compactness, circularity, distance_circularity, radius_ratio, elongatedness, max.length_rectangularity, 
#scaled_radius_of_gyration,skewness_about.2,hollows_ratio are approximately symmetric
#scatter_ratio, pr.axis_rectangularity, scaled_variance, scaled_variance.1, 
#skewness_about, skewness_about.1 are moderately right skewed
#pr.axis_aspect_ratio, max.length_aspect_ratio, scaled_radius_of_gyration.1 are highly right skewed
vehicle.skew()
Out[7]:
compactness                    0.381271
circularity                    0.261809
distance_circularity           0.106585
radius_ratio                   0.394978
pr.axis_aspect_ratio           3.830362
max.length_aspect_ratio        6.778394
scatter_ratio                  0.607271
elongatedness                  0.047847
pr.axis_rectangularity         0.770889
max.length_rectangularity      0.256359
scaled_variance                0.651598
scaled_variance.1              0.842034
scaled_radius_of_gyration      0.279317
scaled_radius_of_gyration.1    2.083496
skewness_about                 0.776519
skewness_about.1               0.688017
skewness_about.2               0.249321
hollows_ratio                 -0.226341
dtype: float64
In [8]:
#Compactness is approximately symmetric; it has a multimodal distribution
sns.kdeplot(vehicle["compactness"])
Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0xb632780>
In [9]:
#Circularity is approximately symmetric; it has a multimodal distribution
sns.kdeplot(vehicle["circularity"])
Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0xb9427f0>
In [10]:
#Distance_circularity has a multimodal distribution
sns.kdeplot(vehicle["distance_circularity"])
Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0xb9c5518>
In [11]:
#radius_ratio has a multimodal distribution, with many values concentrated near the peak
sns.kdeplot(vehicle["radius_ratio"])
Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0xba28cc0>
In [12]:
#pr.axis_aspect_ratio is highly right skewed (skew ≈ 3.83) and has a multimodal distribution
sns.kdeplot(vehicle["pr.axis_aspect_ratio"])
Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0xba85ba8>
In [13]:
#max.length_aspect_ratio is highly right skewed (skew ≈ 6.78) and has a bimodal distribution
sns.kdeplot(vehicle["max.length_aspect_ratio"])
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0xbace048>
In [14]:
#scatter_ratio has an approximately symmetric, multimodal distribution
sns.kdeplot(vehicle["scatter_ratio"])
Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0xbb2d0f0>
In [15]:
#elongatedness has an approximately symmetric, multimodal distribution
sns.kdeplot(vehicle["elongatedness"])
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0xbb553c8>
In [16]:
#pr.axis_rectangularity is moderately right skewed (skew ≈ 0.77) with a multimodal distribution
sns.kdeplot(vehicle["pr.axis_rectangularity"])
Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0xbc1f6a0>
In [17]:
#max.length_rectangularity has a multimodal distribution
sns.kdeplot(vehicle["max.length_rectangularity"])
Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0xbc7d748>
In [18]:
#scaled_variance is moderately right skewed (skew ≈ 0.65) with a multimodal distribution
sns.kdeplot(vehicle["scaled_variance"])
Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0xbce6f28>
In [19]:
#scaled_variance.1 is moderately right skewed (skew ≈ 0.84) with a multimodal distribution
sns.kdeplot(vehicle["scaled_variance.1"])
Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0xbd4ce48>
In [20]:
#scaled_radius_of_gyration has a multimodal distribution
sns.kdeplot(vehicle["scaled_radius_of_gyration"])
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0xbdb5860>
In [21]:
#scaled_radius_of_gyration.1 is highly right skewed (skew ≈ 2.08) and has a multimodal distribution
sns.kdeplot(vehicle["scaled_radius_of_gyration.1"])
Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0xbe0c3c8>
In [22]:
#skewness_about is moderately right skewed (skew ≈ 0.78) with a multimodal distribution
sns.kdeplot(vehicle["skewness_about"])
Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0xbe73828>
In [23]:
#skewness_about.1 is moderately right skewed (skew ≈ 0.69) with a multimodal distribution
sns.kdeplot(vehicle["skewness_about.1"])
Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x5bbf400>
In [24]:
#skewness_about.2 has a multimodal distribution
sns.kdeplot(vehicle["skewness_about.2"])
Out[24]:
<matplotlib.axes._subplots.AxesSubplot at 0xbf40710>
In [25]:
#hollows_ratio has a multimodal distribution
sns.kdeplot(vehicle["hollows_ratio"])
Out[25]:
<matplotlib.axes._subplots.AxesSubplot at 0xbe3ad68>
In [26]:
#Most of the features have multimodal distributions
In [27]:
sns.boxplot(vehicle["compactness"])
Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0xc011358>
In [28]:
sns.boxplot(vehicle["circularity"])
Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0xd02c550>
In [29]:
sns.boxplot(vehicle["distance_circularity"])
Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0xd07d6a0>
In [30]:
#radius_ratio has a small number of outliers
sns.boxplot(vehicle["radius_ratio"])
Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0xc011710>
In [31]:
#pr.axis_aspect_ratio has outliers
sns.boxplot(vehicle["pr.axis_aspect_ratio"])
Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0xd133080>
In [32]:
#max.length_aspect_ratio contains outliers
sns.boxplot(vehicle["max.length_aspect_ratio"])
Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0xd1843c8>
In [33]:
sns.boxplot(vehicle["scatter_ratio"])
Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0xd1de7b8>
In [34]:
sns.boxplot(vehicle["elongatedness"])
Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0xd242748>
In [35]:
sns.boxplot(vehicle["pr.axis_rectangularity"])
Out[35]:
<matplotlib.axes._subplots.AxesSubplot at 0xd298828>
In [36]:
sns.boxplot(vehicle["max.length_rectangularity"])
Out[36]:
<matplotlib.axes._subplots.AxesSubplot at 0xd2f7e10>
In [37]:
sns.boxplot(vehicle["scaled_variance"])
Out[37]:
<matplotlib.axes._subplots.AxesSubplot at 0xd3503c8>
In [38]:
sns.boxplot(vehicle["scaled_variance.1"])
Out[38]:
<matplotlib.axes._subplots.AxesSubplot at 0xd3a2048>
In [39]:
sns.boxplot(vehicle["scaled_radius_of_gyration"])
Out[39]:
<matplotlib.axes._subplots.AxesSubplot at 0xd40a0b8>
In [40]:
#scaled_radius_of_gyration.1 contains outliers
sns.boxplot(vehicle["scaled_radius_of_gyration.1"])
Out[40]:
<matplotlib.axes._subplots.AxesSubplot at 0xd462588>
In [41]:
sns.boxplot(vehicle["skewness_about"])
Out[41]:
<matplotlib.axes._subplots.AxesSubplot at 0xd4c0828>
In [42]:
sns.boxplot(vehicle["skewness_about.1"])
Out[42]:
<matplotlib.axes._subplots.AxesSubplot at 0xd5104e0>
In [43]:
sns.boxplot(vehicle["skewness_about.2"])
Out[43]:
<matplotlib.axes._subplots.AxesSubplot at 0xd56df98>
In [44]:
sns.boxplot(vehicle["hollows_ratio"])
Out[44]:
<matplotlib.axes._subplots.AxesSubplot at 0xd5c9dd8>
In [45]:
#Most of the feature distributions seem fine, while a few contain outliers
In [46]:
#Data has most cases of car, followed by bus and van
vehicle["class"].value_counts()
Out[46]:
car    429
bus    218
van    199
Name: class, dtype: int64

BIVARIATE ANALYSIS

In [47]:
# Per-class feature means — the three classes separate on several features.
vehicle.groupby('class').mean()
Out[47]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio
class
bus 91.591743 44.981308 76.767442 165.708333 63.414747 7.013761 170.022936 40.114679 20.580645 146.701835 192.889908 448.894495 181.032407 77.096774 4.794393 10.211009 187.811927 191.325688
car 96.184149 46.035047 88.878788 180.591549 60.992991 8.825175 181.053738 38.093458 21.511682 149.967366 197.894614 500.543326 179.613054 69.924883 7.126464 15.170561 189.474299 197.582751
van 90.562814 42.070352 73.247475 147.176768 61.261307 9.713568 141.537688 47.939698 18.575758 145.175879 163.964646 298.201005 157.276382 72.778894 6.417085 9.698492 188.939698 196.145729
In [48]:
# Per-class feature medians — also clearly separable across the three classes.
vehicle.groupby('class').median()
Out[48]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio
class
bus 89 44.0 72.0 167.5 64.0 6 152.0 44.0 19.0 145 177.0 344.0 176.0 76.0 5.0 10.0 186.0 189
car 97 46.0 94.0 186.0 61.0 9 185.0 36.0 22.0 150 206.0 512.0 182.0 70.0 6.0 14.0 189.0 198
van 90 42.0 75.0 144.0 59.0 9 142.0 47.0 18.0 145 164.0 300.0 159.0 72.0 6.0 9.0 188.0 196
In [49]:
## The mean and median values are clearly separable across the 3 vehicle classes.
## We will use the median to impute missing values, since the data is skewed.
In [50]:
sns.boxplot(x='class',y='compactness',data=vehicle)
Out[50]:
<matplotlib.axes._subplots.AxesSubplot at 0xd623470>
In [51]:
sns.boxplot(x='class',y='circularity',data=vehicle)
Out[51]:
<matplotlib.axes._subplots.AxesSubplot at 0xd6b1a58>
In [52]:
sns.boxplot(x='class',y='distance_circularity',data=vehicle)
Out[52]:
<matplotlib.axes._subplots.AxesSubplot at 0xd733be0>
In [53]:
sns.boxplot(x='class',y='radius_ratio',data=vehicle)
Out[53]:
<matplotlib.axes._subplots.AxesSubplot at 0xd7a8b70>
In [54]:
sns.boxplot(x='class',y='pr.axis_aspect_ratio',data=vehicle)
Out[54]:
<matplotlib.axes._subplots.AxesSubplot at 0xd623978>
In [55]:
sns.boxplot(x='class',y='max.length_aspect_ratio',data=vehicle)
Out[55]:
<matplotlib.axes._subplots.AxesSubplot at 0xd899b00>
In [56]:
sns.boxplot(x='class',y='scatter_ratio',data=vehicle)
Out[56]:
<matplotlib.axes._subplots.AxesSubplot at 0xd911fd0>
In [57]:
sns.boxplot(x='class',y='elongatedness',data=vehicle)
Out[57]:
<matplotlib.axes._subplots.AxesSubplot at 0xd890668>
In [58]:
sns.boxplot(x='class',y='pr.axis_rectangularity',data=vehicle)
Out[58]:
<matplotlib.axes._subplots.AxesSubplot at 0xe9df908>
In [59]:
sns.boxplot(x='class',y='max.length_rectangularity',data=vehicle)
Out[59]:
<matplotlib.axes._subplots.AxesSubplot at 0xea64550>
In [60]:
sns.boxplot(x='class',y='scaled_variance',data=vehicle)
Out[60]:
<matplotlib.axes._subplots.AxesSubplot at 0xeae06a0>
In [61]:
sns.boxplot(x='class',y='scaled_variance.1',data=vehicle)
Out[61]:
<matplotlib.axes._subplots.AxesSubplot at 0xeb60c50>
In [62]:
sns.boxplot(x='class',y='scaled_radius_of_gyration',data=vehicle)
Out[62]:
<matplotlib.axes._subplots.AxesSubplot at 0xebd4e80>
In [63]:
sns.boxplot(x='class',y='scaled_radius_of_gyration.1',data=vehicle)
Out[63]:
<matplotlib.axes._subplots.AxesSubplot at 0xec328d0>
In [64]:
sns.boxplot(x='class',y='skewness_about',data=vehicle)
Out[64]:
<matplotlib.axes._subplots.AxesSubplot at 0xebd4828>
In [65]:
sns.boxplot(x='class',y='skewness_about.1',data=vehicle)
Out[65]:
<matplotlib.axes._subplots.AxesSubplot at 0xec57748>
In [66]:
sns.boxplot(x='class',y='skewness_about.2',data=vehicle)
Out[66]:
<matplotlib.axes._subplots.AxesSubplot at 0xedbfac8>
In [67]:
sns.boxplot(x='class',y='hollows_ratio',data=vehicle)
Out[67]:
<matplotlib.axes._subplots.AxesSubplot at 0xee39668>
In [68]:
#As stated above, the features have different value distributions, so the classes are fairly separable.
#However, some class pairs still overlap on individual features.
In [69]:
## Filling blanks in each column with the median of that column within the row's class
In [70]:
# Impute missing values in each feature with the median of the row's class
# (the data is skewed, so the median is more robust than the mean).
vehicle_col=['compactness', 'circularity', 'distance_circularity', 'radius_ratio', 'pr.axis_aspect_ratio', 
             'max.length_aspect_ratio', 'scatter_ratio','elongatedness', 'pr.axis_rectangularity', 
             'max.length_rectangularity', 'scaled_variance', 'scaled_variance.1', 'scaled_radius_of_gyration',
             'scaled_radius_of_gyration.1', 'skewness_about', 'skewness_about.1', 'skewness_about.2', 'hollows_ratio']

for item in vehicle_col:
    # Assign back instead of fillna(..., inplace=True): inplace fillna on a
    # column selection is a chained-assignment hazard and stops working under
    # pandas copy-on-write (pandas >= 3.0).
    vehicle[item] = vehicle[item].fillna(vehicle.groupby('class')[item].transform('median'))
In [71]:
vehicle.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 846 entries, 0 to 845
Data columns (total 19 columns):
compactness                    846 non-null int64
circularity                    846 non-null float64
distance_circularity           846 non-null float64
radius_ratio                   846 non-null float64
pr.axis_aspect_ratio           846 non-null float64
max.length_aspect_ratio        846 non-null int64
scatter_ratio                  846 non-null float64
elongatedness                  846 non-null float64
pr.axis_rectangularity         846 non-null float64
max.length_rectangularity      846 non-null int64
scaled_variance                846 non-null float64
scaled_variance.1              846 non-null float64
scaled_radius_of_gyration      846 non-null float64
scaled_radius_of_gyration.1    846 non-null float64
skewness_about                 846 non-null float64
skewness_about.1               846 non-null float64
skewness_about.2               846 non-null float64
hollows_ratio                  846 non-null int64
class                          846 non-null object
dtypes: float64(14), int64(4), object(1)
memory usage: 125.7+ KB
In [72]:
#Now there are no blanks in data
vehicle.isna().sum()
Out[72]:
compactness                    0
circularity                    0
distance_circularity           0
radius_ratio                   0
pr.axis_aspect_ratio           0
max.length_aspect_ratio        0
scatter_ratio                  0
elongatedness                  0
pr.axis_rectangularity         0
max.length_rectangularity      0
scaled_variance                0
scaled_variance.1              0
scaled_radius_of_gyration      0
scaled_radius_of_gyration.1    0
skewness_about                 0
skewness_about.1               0
skewness_about.2               0
hollows_ratio                  0
class                          0
dtype: int64
In [73]:
# Many features have high correlation with each other — visualize the matrix.
# Restrict to numeric columns first: DataFrame.corr() on a frame containing
# the object-dtype 'class' column raises a TypeError on pandas >= 2.0.
corr = vehicle.select_dtypes(include='number').corr()
plt.subplots(figsize=(15, 10))
sns.heatmap(corr, cmap="YlGnBu", annot=True)
Out[73]:
<matplotlib.axes._subplots.AxesSubplot at 0xd350518>
In [74]:
#Many features are correlated with each other, as is visible from the pairplot
sns.pairplot(vehicle)
Out[74]:
<seaborn.axisgrid.PairGrid at 0xf5997f0>
In [75]:
sns.pairplot(vehicle,hue='class')
Out[75]:
<seaborn.axisgrid.PairGrid at 0x1c9fdda0>
In [76]:
#There are a few variables which exhibit weak relationships.
#No single variable clearly separates the 3 groups; the distributions overlap. However their peaks,
#or parts of some distributions, differ — so these features should still add value in a model.
#Many variables are highly correlated, so PCA is a reasonable next step.
In [77]:
vehicle.groupby('class').quantile([.25, .5, .75, .90, .95, .99,1])
Out[77]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio
class
bus 0.25 85.00 42.00 69.00 139.25 56.00 6.0 149.00 36.00 19.0 142.00 170.00 327.00 163.25 71.00 2.00 5.00 181.00 184.00
0.50 89.00 44.00 72.00 167.50 64.00 6.0 152.00 44.00 19.0 145.00 177.00 344.00 176.00 76.00 5.00 10.00 186.00 189.00
0.75 98.00 47.00 80.00 188.00 69.00 7.0 177.75 45.00 21.0 149.75 203.75 486.50 190.75 82.00 7.00 14.00 195.00 198.75
0.90 105.00 53.00 98.00 203.00 72.00 8.0 236.00 46.00 26.0 164.00 256.60 825.30 229.30 85.00 9.00 20.00 198.00 203.00
0.95 107.15 55.15 103.15 209.00 74.00 8.0 250.15 46.00 27.0 169.00 272.45 923.75 247.45 87.00 10.00 22.15 200.00 205.00
0.99 110.83 57.00 106.83 237.49 75.83 22.0 260.83 46.00 28.0 172.00 285.00 986.15 261.83 88.00 12.83 27.83 201.83 206.83
1.00 114.00 58.00 109.00 252.00 126.00 52.0 265.00 47.00 29.0 175.00 288.00 1018.00 268.00 127.00 17.00 31.00 202.00 209.00
car 0.25 89.00 39.00 77.00 158.00 57.00 7.0 152.00 31.00 19.0 134.00 174.00 343.00 148.00 66.00 2.00 6.00 187.00 195.00
0.50 97.00 46.00 94.00 186.00 61.00 9.0 185.00 36.00 22.0 150.00 206.00 512.00 182.00 70.00 6.00 14.00 189.00 198.00
0.75 103.00 53.00 103.00 204.00 64.00 11.0 212.00 44.00 24.0 165.00 223.00 666.00 212.00 73.00 11.00 22.00 192.00 202.00
0.90 107.00 55.00 107.00 219.00 67.00 11.0 219.00 50.00 25.0 173.00 228.00 709.00 221.00 75.00 15.00 29.00 197.00 205.00
0.95 109.00 55.60 108.00 224.60 69.00 12.0 221.60 52.00 25.0 175.00 231.00 721.60 226.00 80.00 17.00 32.60 198.60 208.00
0.99 115.00 57.00 109.72 231.00 71.72 12.0 225.00 58.72 25.0 180.00 235.72 746.04 244.72 86.00 21.00 38.00 203.00 210.00
1.00 119.00 59.00 112.00 234.00 74.00 13.0 227.00 61.00 25.0 188.00 241.00 757.00 250.00 87.00 22.00 41.00 206.00 211.00
van 0.25 88.00 39.00 66.00 132.50 56.00 7.0 131.00 44.00 18.0 137.50 156.00 252.00 139.00 67.00 2.50 5.00 184.00 192.00
0.50 90.00 42.00 75.00 144.00 59.00 9.0 142.00 47.00 18.0 145.00 164.00 300.00 159.00 72.00 6.00 9.00 188.00 196.00
0.75 93.00 46.00 83.00 159.00 64.00 10.0 155.00 52.00 19.5 155.00 172.00 354.00 176.00 76.00 9.00 14.00 193.00 201.00
0.90 96.00 47.00 85.00 166.00 67.00 11.0 159.00 55.00 20.0 160.00 176.00 366.20 186.00 81.20 13.00 18.00 199.00 207.00
0.95 97.00 48.00 88.00 174.00 68.10 11.1 160.00 57.00 20.0 163.00 179.00 371.00 187.00 86.10 15.00 21.00 201.00 208.00
0.99 98.04 49.00 91.00 306.32 126.14 49.0 162.02 58.00 20.0 167.00 231.82 381.12 200.00 99.38 18.00 26.02 203.00 210.00
1.00 100.00 50.00 92.00 333.00 138.00 55.0 163.00 58.00 20.0 170.00 320.00 389.00 203.00 135.00 19.00 29.00 204.00 210.00
In [78]:
## There doesn't seem to be any truly unusual value in the data; for now we keep the data as is rather than removing outliers
In [79]:
# Separate features (X) and target (y).
# drop(columns=...) replaces the positional axis argument:
# vehicle.drop(['class'], 1) is deprecated and removed in pandas >= 2.0.
X_vehicle = vehicle.drop(columns=['class'])
y_vehicle = vehicle['class']
print(X_vehicle.shape)
print(y_vehicle.shape)
(846, 18)
(846,)
In [80]:
#### Preprocessing of data for standardization
In [81]:
from sklearn.preprocessing import scale
# Standardize every feature to zero mean / unit variance (required before PCA,
# which is scale-sensitive).
# NOTE(review): scaling is fit on the FULL dataset before the train/test split,
# so test-set statistics leak into the transform — consider fitting a
# StandardScaler on the training split only. TODO confirm intended.
X_std = scale(X_vehicle)
In [82]:
# Wrap the scaled array back into a labelled DataFrame and preview it.
X_std = pd.DataFrame(data=X_std, columns=X_vehicle.columns)
X_std.head()
Out[82]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio
0 0.160580 0.517682 0.059308 0.271910 1.309886 0.311542 -0.208583 0.137295 -0.223684 0.758332 -0.403165 -0.343931 0.285523 -0.327232 -0.072834 0.380475 -0.312208 0.183957
1 -0.325470 -0.624110 0.122821 -0.835616 -0.594175 0.094079 -0.600383 0.521478 -0.609882 -0.344578 -0.594494 -0.621727 -0.513811 -0.059367 0.539264 0.156401 0.013073 0.452977
2 1.254193 0.843908 1.520094 1.199837 0.548262 0.311542 1.147649 -1.143315 0.934909 0.689401 1.095585 1.107410 1.392295 0.074565 1.559428 -0.403783 -0.149568 0.049447
3 -0.082445 -0.624110 -0.004204 -0.296819 0.167449 0.094079 -0.751075 0.649538 -0.609882 -0.344578 -0.913377 -0.740782 -1.466865 -1.264758 -0.072834 -0.291746 1.639478 1.529056
4 -1.054545 -0.134771 -0.766353 1.080105 5.244946 9.444962 -0.600383 0.521478 -0.609882 -0.275646 1.669575 -0.650073 0.408498 7.306912 0.539264 -0.179709 -1.450692 -1.699181

Test Train Split of the data

In [83]:
from sklearn.model_selection import train_test_split
In [84]:
X_train_std, X_test, y_train,y_test=train_test_split(X_std,y_vehicle,test_size=0.2,random_state=100)

Using PCA - Principal Component Analysis

In [85]:
from sklearn.decomposition import PCA
# Keep all components for now (n_components=None) so the scree plot can show
# the full variance curve; the component count is narrowed down later.
pca=PCA(svd_solver='randomized',random_state=10)
In [86]:
#Applying pca on train dataset
pca.fit(X_train_std)
Out[86]:
PCA(copy=True, iterated_power='auto', n_components=None, random_state=10,
  svd_solver='randomized', tol=0.0, whiten=False)
In [87]:
pca.components_
Out[87]:
array([[ 0.27438507,  0.29339069,  0.30735402,  0.26665802,  0.07932837,
         0.10342389,  0.31687352, -0.31225349,  0.31326375,  0.28445416,
         0.30773859,  0.31318119,  0.26987769, -0.02206269,  0.03551522,
         0.06888933,  0.03200495,  0.07729955],
       [-0.13982148,  0.12325101, -0.07458866, -0.1567036 , -0.04150534,
         0.10202929,  0.03578733,  0.02308899,  0.04813736,  0.11492449,
         0.06702455,  0.03952935,  0.20229574,  0.51432064, -0.07997852,
        -0.10655326, -0.54464927, -0.528934  ],
       [-0.09128275, -0.05088672, -0.04823387,  0.29001475,  0.64095314,
         0.5996713 , -0.10363019,  0.05931969, -0.11642331, -0.04038676,
         0.05528018, -0.11420611, -0.06339069,  0.23464534, -0.07516217,
        -0.05067782,  0.08906796,  0.10698775],
       [-0.07120124, -0.20573269,  0.07622259,  0.03869977, -0.04069875,
        -0.00580278,  0.09119521, -0.08207297,  0.08760835, -0.20186154,
         0.12404906,  0.08493155, -0.22392478,  0.0688637 , -0.55356038,
         0.69647229, -0.10049641, -0.03229015],
       [ 0.07924178, -0.07437295,  0.03773561, -0.05477097, -0.06573649,
         0.1910921 , -0.01940544,  0.08172437,  0.00216023, -0.04450995,
        -0.00823499, -0.02054047, -0.0614228 ,  0.13347126,  0.77288436,
         0.55321472, -0.08770053, -0.02686329],
       [ 0.10026501, -0.25241062, -0.14171494,  0.26489147,  0.27920477,
        -0.48002665,  0.10945804, -0.14460738,  0.09071462, -0.44244867,
         0.22338579,  0.14802759, -0.09688215,  0.23441574,  0.20674181,
        -0.16185707,  0.13955809, -0.25720686],
       [ 0.41656955, -0.2786799 ,  0.07788527, -0.14715033, -0.38939057,
         0.4531718 ,  0.0902063 , -0.01540989,  0.11604667, -0.17538598,
         0.14155751,  0.11306044, -0.38837886,  0.10620899, -0.03588224,
        -0.33877688,  0.01915502, -0.01151812],
       [-0.59242894, -0.15263555,  0.43181621,  0.12049407, -0.02991792,
         0.12631993,  0.09404785, -0.21518671,  0.0590771 , -0.2321766 ,
         0.0317381 ,  0.03122456, -0.09277475, -0.35125576,  0.16636374,
        -0.19134206, -0.30956721, -0.03546112],
       [-0.49010247, -0.01748784, -0.17357738, -0.21448272, -0.28777658,
         0.14245811,  0.06206487, -0.14232438,  0.01035399, -0.09577551,
         0.29621977,  0.08678094,  0.28262436,  0.30205183,  0.0275769 ,
         0.00578368,  0.50255388,  0.16468286],
       [-0.28457686,  0.09548712, -0.23528216, -0.04808722,  0.09588595,
        -0.10684651,  0.1778013 , -0.13153824,  0.20701254,  0.47399767,
        -0.12205989,  0.17034733, -0.6612455 ,  0.13350474,  0.08935511,
        -0.03957807,  0.04312112,  0.06912382],
       [ 0.0236006 ,  0.00918777,  0.69446632, -0.12823174,  0.03279552,
        -0.26967218, -0.1444449 , -0.01658899, -0.23903674,  0.12991894,
         0.07477168, -0.2088216 , -0.16392184,  0.46279442, -0.01159108,
        -0.05289406,  0.09809779,  0.16992491],
       [-0.00209999, -0.16093792, -0.06051254, -0.10790123,  0.09502122,
        -0.11453546,  0.10755227,  0.20489878,  0.30266525, -0.16892065,
        -0.18294794,  0.19113785,  0.18390702,  0.23832709,  0.0035656 ,
        -0.08245699, -0.37373237,  0.67515799],
       [-0.15791792,  0.08280993,  0.17814559,  0.15116511, -0.04151373,
        -0.0365796 , -0.06004093,  0.82475433,  0.20791357,  0.04579045,
         0.25420205,  0.25249196, -0.07845179, -0.06858452, -0.01340738,
        -0.02076823,  0.1463455 , -0.14194834],
       [-0.02193129,  0.23595821, -0.24982984,  0.49569229, -0.34229112,
        -0.0711337 , -0.08757349,  0.00550918, -0.2950865 , -0.01083649,
         0.41504172, -0.1742323 , -0.1583539 ,  0.06845883,  0.02768972,
        -0.02692114, -0.31206327,  0.29554483],
       [ 0.02687022, -0.58349143, -0.09924512, -0.22379668,  0.17484421,
        -0.08110687, -0.07418386,  0.01008576, -0.06470273,  0.44398367,
         0.5171422 , -0.07692342,  0.11278488, -0.20774142,  0.02893771,
        -0.01140978, -0.14974186,  0.04689426],
       [-0.05499917, -0.48251165,  0.04411619,  0.55942064, -0.30384628,
         0.00779882, -0.01159719,  0.00679325,  0.11419627,  0.3137102 ,
        -0.38087156, -0.08943054,  0.18987845,  0.18732553, -0.02282896,
        -0.00668356,  0.1170214 , -0.08231357],
       [ 0.00243792, -0.11119834,  0.02009347,  0.01515627,  0.00769509,
         0.01825721,  0.31100316,  0.07872963, -0.71464431,  0.06054793,
        -0.14077133,  0.58923014,  0.03849676,  0.00392619,  0.00136175,
         0.00412294, -0.02535848,  0.01144865],
       [-0.00288583, -0.00505168, -0.00357298, -0.03409898,  0.02752057,
        -0.00827723,  0.81754691,  0.22619323, -0.05634036, -0.02037527,
         0.02112198, -0.52163329,  0.0132804 , -0.00837149, -0.00283506,
        -0.00939403,  0.04067609, -0.01686211]])
In [88]:
pca.explained_variance_
Out[88]:
array([9.26096910e+00, 2.97004763e+00, 2.26832663e+00, 1.16402483e+00,
       9.75853089e-01, 5.44970697e-01, 3.69465451e-01, 2.29014273e-01,
       1.59251241e-01, 9.42116813e-02, 6.72511657e-02, 4.57683488e-02,
       3.34682906e-02, 2.61491813e-02, 2.02722374e-02, 1.66133975e-02,
       7.31836344e-03, 8.10135474e-04])
In [89]:
pca.explained_variance_ratio_
Out[89]:
array([5.07345119e-01, 1.62708584e-01, 1.24266093e-01, 6.37689543e-02,
       5.34603124e-02, 2.98552149e-02, 2.02404836e-02, 1.25461248e-02,
       8.72428563e-03, 5.16121327e-03, 3.68423113e-03, 2.50733461e-03,
       1.83349860e-03, 1.43253469e-03, 1.11057715e-03, 9.10134355e-04,
       4.00923050e-04, 4.43817784e-05])
In [90]:
# Loadings table: one row per original feature, one column per principal
# component. Built from the transposed components_ matrix instead of spelling
# out all 18 columns by hand (same PC1..PC18 + Feature layout as before).
colnames = list(X_vehicle.columns)
pcs_vehicle_df = pd.DataFrame(
    pca.components_.T,
    columns=['PC{}'.format(i) for i in range(1, pca.components_.shape[0] + 1)],
)
pcs_vehicle_df['Feature'] = colnames
pcs_vehicle_df.head()
Out[90]:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 PC12 PC13 PC14 PC15 PC16 PC17 PC18 Feature
0 0.274385 -0.139821 -0.091283 -0.071201 0.079242 0.100265 0.416570 -0.592429 -0.490102 -0.284577 0.023601 -0.002100 -0.157918 -0.021931 0.026870 -0.054999 0.002438 -0.002886 compactness
1 0.293391 0.123251 -0.050887 -0.205733 -0.074373 -0.252411 -0.278680 -0.152636 -0.017488 0.095487 0.009188 -0.160938 0.082810 0.235958 -0.583491 -0.482512 -0.111198 -0.005052 circularity
2 0.307354 -0.074589 -0.048234 0.076223 0.037736 -0.141715 0.077885 0.431816 -0.173577 -0.235282 0.694466 -0.060513 0.178146 -0.249830 -0.099245 0.044116 0.020093 -0.003573 distance_circularity
3 0.266658 -0.156704 0.290015 0.038700 -0.054771 0.264891 -0.147150 0.120494 -0.214483 -0.048087 -0.128232 -0.107901 0.151165 0.495692 -0.223797 0.559421 0.015156 -0.034099 radius_ratio
4 0.079328 -0.041505 0.640953 -0.040699 -0.065736 0.279205 -0.389391 -0.029918 -0.287777 0.095886 0.032796 0.095021 -0.041514 -0.342291 0.174844 -0.303846 0.007695 0.027521 pr.axis_aspect_ratio
In [91]:
#Making the screeplot - plotting the cumulative variance against the number of components
%matplotlib inline
fig = plt.figure(figsize = (10,6))
plt.plot(range(1,19,1),np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()
In [92]:
#Graph of individual explained variance and cumulative explained variance against Principal Components
fig = plt.figure(figsize = (10,6))
plt.bar(range(1,19), pca.explained_variance_ratio_, alpha=0.5, align='center', label='individual explained variance')
plt.step(range(1,19),np.cumsum(pca.explained_variance_ratio_), where= 'mid', label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc = 'best')
Out[92]:
<matplotlib.legend.Legend at 0x304c2400>
In [93]:
np.cumsum(pca.explained_variance_ratio_)
Out[93]:
array([0.50734512, 0.6700537 , 0.7943198 , 0.85808875, 0.91154906,
       0.94140428, 0.96164476, 0.97419089, 0.98291517, 0.98807638,
       0.99176062, 0.99426795, 0.99610145, 0.99753398, 0.99864456,
       0.9995547 , 0.99995562, 1.        ])
In [94]:
## 7 components explain almost 96% of the variation in the data.
In [95]:
# Project the standardized training data onto the first 7 principal components
# (~96% of the variance per the cumulative-variance array above).
pca_final = PCA(n_components=7)
X_pca = pca_final.fit_transform(X_train_std)
pc_cols = ['PC%d' % i for i in range(1, 8)]
X_train_pca = pd.DataFrame(X_pca, columns=pc_cols)
# Dropped the dead `type(X_train_pca)` expression: its value was discarded
# (only the last expression of a cell is displayed).
X_train_pca.shape
Out[95]:
(676, 7)
In [96]:
#Applying on test data
# NOTE(review): pca_final was fitted on the standardized X_train_std, but here it
# transforms X_test. Confirm X_test has been scaled with the same scaler;
# otherwise this should be pca_final.transform(X_test_std) for consistent scaling.
X_test_pca=pca_final.transform(X_test)
X_test_pca=pd.DataFrame(X_test_pca,columns=['PC1','PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7'])
# `type(X_test_pca)` is a no-op here: only the last expression is displayed.
type(X_test_pca)
X_test_pca.shape
Out[96]:
(170, 7)
In [97]:
## there is very low correlation among the PCA variables
# Draw the train/test correlation heatmaps on separate axes: the original code
# stacked both heatmaps on one figure (the second call shrank the first, as the
# differing AxesSubplot widths in the output showed) and print()ed the Axes
# repr, which is just noise.
fig, (ax_train, ax_test) = plt.subplots(1, 2, figsize=(20, 10))
sns.heatmap(X_train_pca.corr(), annot=True, ax=ax_train)
ax_train.set_title('Train PCA correlations')
sns.heatmap(X_test_pca.corr(), annot=True, ax=ax_test)
ax_test.set_title('Test PCA correlations')
plt.show()
AxesSubplot(0.125,0.125;0.62x0.755)
AxesSubplot(0.125,0.125;0.496x0.755)
In [98]:
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.model_selection import validation_curve
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
In [99]:
#Model Building using SVC - Support Vector Classifier
svc_1=SVC(C=1,kernel='linear',random_state=100)
svc_1.fit(X_train_std,y_train)

# predict
# NOTE(review): the model is trained on standardized features (X_train_std) but
# predicts on X_test - confirm X_test was scaled with the same scaler, otherwise
# predictions are made on a different feature scale.
y_pred_test = svc_1.predict(X_test)
In [100]:
# confusion matrix on the held-out test set (rows = true class, cols = predicted)
confusion = metrics.confusion_matrix(y_test, y_pred_test)
confusion
Out[100]:
array([[45,  2,  1],
       [ 4, 75,  1],
       [ 1,  1, 40]], dtype=int64)
In [101]:
# measure accuracy
# overall test-set accuracy of the linear SVC trained without PCA
metrics.accuracy_score(y_test, y_pred_test)
Out[101]:
0.9411764705882353
In [102]:
# class-wise result without pca
# precision - a class labelled as car is correct 96% of the time, similarly for the other classes.
# recall - a car is actually recognized as a car 94% of the time, similarly for the other classes.
# So the model does a good job, with an accuracy of 94%.
class_wise = metrics.classification_report(y_true=y_test, y_pred=y_pred_test)
print(class_wise)
              precision    recall  f1-score   support

         bus       0.90      0.94      0.92        48
         car       0.96      0.94      0.95        80
         van       0.95      0.95      0.95        42

   micro avg       0.94      0.94      0.94       170
   macro avg       0.94      0.94      0.94       170
weighted avg       0.94      0.94      0.94       170

In [103]:
#Using PCA with SVC
In [104]:
#Model Building
# NOTE(review): refitting svc_1 overwrites its earlier non-PCA fit; any later
# cell reusing svc_1 gets this PCA-trained model (hidden-state hazard on re-run).
svc_1.fit(X_train_pca,y_train)

# predict
y_pred_test_pca = svc_1.predict(X_test_pca)
In [105]:
# confusion matrix for the SVC trained on 7 PCA components
confusion_pca = metrics.confusion_matrix(y_test,y_pred_test_pca)
confusion_pca
Out[105]:
array([[36, 11,  1],
       [13, 61,  6],
       [ 1,  1, 40]], dtype=int64)
In [106]:
# measure accuracy
# accuracy with 7 PCA components - noticeably lower than without PCA (0.94 above)
metrics.accuracy_score(y_test, y_pred_test_pca)
Out[106]:
0.8058823529411765
In [107]:
# class-wise result with pca - performance has clearly decreased. Let's try
# increasing the PCA components from 7 to 9.
class_wise_pca = metrics.classification_report(y_test, y_pred_test_pca)
print(class_wise_pca)
              precision    recall  f1-score   support

         bus       0.72      0.75      0.73        48
         car       0.84      0.76      0.80        80
         van       0.85      0.95      0.90        42

   micro avg       0.81      0.81      0.81       170
   macro avg       0.80      0.82      0.81       170
weighted avg       0.81      0.81      0.80       170

In [108]:
# PCA with 9 components: check whether the extra variance recovers accuracy.
pca_final1 = PCA(n_components=9)
X_pca1 = pca_final1.fit_transform(X_train_std)
pc_cols9 = ['PC%d' % i for i in range(1, 10)]
X_train_pca1 = pd.DataFrame(X_pca1, columns=pc_cols9)

# Applying on test data
# NOTE(review): the PCA was fitted on standardized X_train_std but transforms
# X_test here - confirm X_test is scaled with the same scaler (X_test_std).
X_test_pca1 = pd.DataFrame(pca_final1.transform(X_test), columns=pc_cols9)

# Model Building (reusing svc_1 overwrites its previous fit)
svc_1.fit(X_train_pca1, y_train)

# predict
y_pred_test_pca1 = svc_1.predict(X_test_pca1)
In [109]:
# confusion matrix for the SVC trained on 9 PCA components
confusion_pca1 = metrics.confusion_matrix(y_test,y_pred_test_pca1)
confusion_pca1
Out[109]:
array([[43,  4,  1],
       [ 5, 69,  6],
       [ 0,  3, 39]], dtype=int64)
In [110]:
# measure accuracy
# 9 components recover most, but not all, of the non-PCA accuracy (0.89 vs 0.94)
metrics.accuracy_score(y_test, y_pred_test_pca1)
Out[110]:
0.888235294117647
In [111]:
# class-wise report for the 9-component PCA model
class_wise_pca1 = metrics.classification_report(y_test, y_pred_test_pca1)
print(class_wise_pca1)
              precision    recall  f1-score   support

         bus       0.90      0.90      0.90        48
         car       0.91      0.86      0.88        80
         van       0.85      0.93      0.89        42

   micro avg       0.89      0.89      0.89       170
   macro avg       0.88      0.90      0.89       170
weighted avg       0.89      0.89      0.89       170

In [112]:
## Clearly, SVC without PCA gives better results, with an improvement in accuracy as well.
In [113]:
#Using GridSearchCV first on SVC model
In [114]:
#Regularization using GridSearchCV
# Hyper-parameter grid for SVC. NOTE(review): per the sklearn docs, 'gamma' only
# affects the 'rbf' kernel and is ignored for 'linear', so every linear/gamma
# combination in this grid is a redundant fit.
params = {"kernel": ['linear', 'rbf'],"gamma":[0.1, 1, 10, 100],'C': [0.5,1,10,100]}
In [115]:
# Grid search over the SVC hyper-parameter grid with 3-fold CV.
# (GridSearchCV is already imported in the imports cell above, so the duplicate
# `from sklearn.model_selection import GridSearchCV` was dropped.)
model_cv = GridSearchCV(estimator=model_cv_base if False else svc_1,  # estimator is cloned per fit
                        param_grid=params,
                        scoring='accuracy',
                        cv=3,
                        verbose=1,
                        return_train_score=True)
In [116]:
# Run the grid search on the standardized (non-PCA) training data and report
# the best hyper-parameter combination found by cross-validation.
model_cv.fit(X_train_std, y_train) 

# results of grid search CV
cv_results = pd.DataFrame(model_cv.cv_results_)
#cv_results


#parameters best value
best_score = model_cv.best_score_
best = model_cv.best_params_
best
Fitting 3 folds for each of 32 candidates, totalling 96 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  96 out of  96 | elapsed:    4.9s finished
Out[116]:
{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
In [117]:
#using best parameter values
model_best = SVC(kernel='rbf' ,gamma =0.1 ,C =10)
model_best.fit(X_train_std, y_train)
# predict
# NOTE(review): trained on standardized features (X_train_std) but predicting on
# X_test - confirm X_test was scaled with the same scaler.
y_pred_best = model_best.predict(X_test)
In [118]:
# confusion matrix for the GridSearchCV-tuned model
confusion_gcv = metrics.confusion_matrix(y_test,y_pred_best)
confusion_gcv
Out[118]:
array([[47,  0,  1],
       [ 2, 76,  2],
       [ 0,  3, 39]], dtype=int64)
In [119]:
# measure accuracy
# accuracy and per-class report for the tuned (rbf, C=10, gamma=0.1) model
print("Accuracy: ",metrics.accuracy_score(y_test, y_pred_best))

# class-wise result with GridSearchCV selected model
class_wise_gcv = metrics.classification_report(y_test, y_pred_best)
print(class_wise_gcv)
Accuracy:  0.9529411764705882
              precision    recall  f1-score   support

         bus       0.96      0.98      0.97        48
         car       0.96      0.95      0.96        80
         van       0.93      0.93      0.93        42

   micro avg       0.95      0.95      0.95       170
   macro avg       0.95      0.95      0.95       170
weighted avg       0.95      0.95      0.95       170

In [120]:
#The model has improved with GridSearch. Now let's try GridSearch with 9 components of PCA.
In [121]:
# Repeat the grid search, this time on the 9-component PCA training data.
params = {"kernel": ['linear', 'rbf'], "gamma": [0.1, 1, 10, 100], 'C': [0.5, 1, 10, 100]}


model_cv1 = GridSearchCV(estimator=svc_1,
                         param_grid=params,
                         scoring='accuracy',
                         cv=3,
                         verbose=1,
                         return_train_score=True)

model_cv1.fit(X_train_pca1, y_train)

# results of grid search CV
cv_results1 = pd.DataFrame(model_cv1.cv_results_)
#cv_results1


# best parameter values found by the search
best_score1 = model_cv1.best_score_
best1 = model_cv1.best_params_
best1
Fitting 3 folds for each of 32 candidates, totalling 96 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  96 out of  96 | elapsed:    6.7s finished
Out[121]:
{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
In [122]:
#using best parameter values
# Tuned SVC refit on the 9-component PCA features.
model_best1 = SVC(kernel='rbf' ,gamma =0.1 ,C =10)
model_best1.fit(X_train_pca1,y_train)
# predict
y_pred_best1 = model_best1.predict(X_test_pca1)
In [123]:
# confusion matrix for the tuned model on 9 PCA components
confusion_gcv1 = metrics.confusion_matrix(y_test,y_pred_best1)
confusion_gcv1
Out[123]:
array([[47,  0,  1],
       [ 3, 73,  4],
       [ 0,  3, 39]], dtype=int64)
In [124]:
# measure accuracy
# tuned PCA-9 model: close to, but slightly below, the tuned non-PCA model (0.953)
metrics.accuracy_score(y_test, y_pred_best1)
Out[124]:
0.9352941176470588
In [125]:
# class-wise result for the tuned PCA-9 model
class_wise_gcv1 = metrics.classification_report(y_test, y_pred_best1)
print(class_wise_gcv1)
              precision    recall  f1-score   support

         bus       0.94      0.98      0.96        48
         car       0.96      0.91      0.94        80
         van       0.89      0.93      0.91        42

   micro avg       0.94      0.94      0.94       170
   macro avg       0.93      0.94      0.93       170
weighted avg       0.94      0.94      0.94       170

In [126]:
#GridSearchCV with 9 PCA components gives comparable results to GridSearchCV on the SVC algorithm alone.
#We can consider this to be better, as it uses fewer features, thus making the model less complex.
In [127]:
#The vehicles chosen are readily distinguishable: bus, van and car.
#But the data shows no separability between the two car classes.
In [128]:
#GridSearch CV with SVC
#precision - a class labelled as car is correct 96% of the time, similarly for the other classes.
#recall - a car is actually recognized as a car 95% of the time by the algorithm, similarly for the other classes.
#So the model does a good job, with an accuracy of 95.3%

#Accuracy:  0.9529411764705882
#              precision    recall  f1-score   support
#
#         bus       0.96      0.98      0.97        48
#         car       0.96      0.95      0.96        80
#         van       0.93      0.93      0.93        42

#   micro avg       0.95      0.95      0.95       170
#   macro avg       0.95      0.95      0.95       170
#weighted avg       0.95      0.95      0.95       170


#GridSearch CV with SVC and PCA-9
#precision - a class labelled as car is correct 96% of the time, similarly for the other classes.
#recall - a car is actually recognized as a car 91% of the time by the algorithm, similarly for the other classes.
#So the model does a good job, with an accuracy of 93.5%

#Accuracy:  0.9352941176470588

# precision    recall  f1-score   support
#
#         bus       0.94      0.98      0.96        48
#         car       0.96      0.91      0.94        80
#         van       0.89      0.93      0.91        42

#   micro avg       0.94      0.94      0.94       170
#   macro avg       0.93      0.94      0.93       170
#weighted avg       0.94      0.94      0.94       170